Analysis of tweets

For this analysis we collected tweets containing the set of public health keywords using the free Twitter Search API, which returns a random portion of matching tweets from the past seven days. This notebook does not include the collection process, but the analysis of the collected tweets.

Select language using one of the following codes:

  • en (English, default option)
  • de (German)
  • es (Spanish)
  • fr (French)
  • pt (Portuguese)
In [66]:
language = 'es'
In [67]:
language_ref = { 'en' : { 'name' : 'English', 'min_coocurrence' : 10, 'min_coocurrence_hashtags' : 2},
                 'de' : { 'name' : 'German', 'min_coocurrence' : 1, 'min_coocurrence_hashtags' : 1},
                 'es' : { 'name' : 'Spanish', 'min_coocurrence' : 1, 'min_coocurrence_hashtags' : 1},
                 'fr' : { 'name' : 'French', 'min_coocurrence' : 1, 'min_coocurrence_hashtags' : 1},
                 'pt' : { 'name' : 'Portuguese', 'min_coocurrence' : 1, 'min_coocurrence_hashtags' : 1},
               }

Loading tweets

In [68]:
import os

tweets_folder = os.path.join("..", "data", "tweets", language)
tweets_climate_filename_prefix = "tweets_climate." + language + "."
tweets_health_filename_prefix = "tweets_health." + language + "."

tweets_climate = {}
tweets_health = {}
In [69]:
import pandas as pd
import csv
    
def load_tweets(filename):
    tweets = {}
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            tweets[row['id']] = row
    return tweets
In [70]:
import time
from ast import literal_eval
from textblob import TextBlob

def get_location(tweet):
    try:
        user = eval(str(tweet['user']))
        location = user['location']
    except:
        location = 'not_specified'
    if location == '':
        location = 'not_specified'
    return location

def get_date(tweet):
    timestamp = tweet['created_at']
    try:
        parsed_timestamp = time.strptime(timestamp, '%a %b %d %H:%M:%S %z %Y')
        month_year = "%s %s" % (parsed_timestamp.tm_mon, parsed_timestamp.tm_year)
    except:
        month_year = 'missing'
    return month_year

def get_text(tweet):
    text = str(tweet['text'])
    return text

def get_user(tweet):
    try:
        user = eval(str(tweet['user']))
        username = user['screen_name']
    except:
        username = 'not_specified'
    return "@" + username

def get_hashtags(tweet):
    hashtags = []
    try:
        hashtags_entities = eval(str(tweet['entities']))['hashtags']
        for hashtag_entity in hashtags_entities:
            hashtags.append("#" + hashtag_entity['text'])
    except:
        pass
    return hashtags

def get_user_mentions(tweet):
    user_mentions = []
    try:
        user_mentions_entities = eval(str(tweet['entities']))['user_mentions']
        for user_mention_entity in user_mentions_entities:
            user_mentions.append("@" + user_mention_entity['screen_name'])
    except:
        pass
    return user_mentions

def get_sentiment(tweet_text):
    if language == 'en':
        sentiment = TextBlob(tweet_text).sentiment
        if sentiment.polarity >= 0.1:
            polarity = 'positive'
        else:
            if sentiment.polarity < 0.1 and sentiment.polarity > -0.1:
                polarity = 'neutral'
            else:
                polarity = 'negative'
        if sentiment.subjectivity >= 0.5:
            subjectivity = 'subjective'
        else:
            subjectivity = 'objective'
    else:
        polarity = 'no_polarity'
        subjectivity = 'no_subjectivity'
    
    return (polarity, subjectivity)

def process_tweet(tweet):
    date = get_date(tweet)
    location = get_location(tweet)
    user = get_user(tweet)
    text = get_text(tweet)
    sentiment = get_sentiment(text)
    hashtags = get_hashtags(tweet)
    user_mentions = get_user_mentions(tweet)
    return {'date' : date, 'location' : location, 'text' : text, 'hashtags' : hashtags, 'user' : user, 'sentiment' : sentiment, 'user_mentions' : user_mentions}
In [71]:
from os import listdir

files = listdir(tweets_folder)
for f in files:
    if tweets_health_filename_prefix in f:
        print("Loading %s" % f)
        for (tweet_id, tweet) in load_tweets(os.path.join(tweets_folder, f)).items():
            tweets_health[tweet_id] = process_tweet(tweet)
#    if tweets_climate_filename_prefix in f:
#        print("Loading %s" % f)
#        for (tweet_id, tweet) in load_tweets(os.path.join(tweets_folder, f)).items():
#            tweets_climate[tweet_id] = process_tweet(tweet)
Loading tweets_health.es.2018-07-27 21:44:11.csv
Loading tweets_health.es.2018-07-28 17:34:53.csv
Loading tweets_health.es.2018-07-30 23:27:25.csv
Loading tweets_health.es.2018-07-29 13:38:20.csv
Loading tweets_health.es.2018-07-30 15:48:15.csv
Loading tweets_health.es.2018-07-30 01:31:41.csv
In [72]:
print("Climate tweets: %d - Health tweets: %d" % (len(tweets_climate), len(tweets_health)))
Climate tweets: 0 - Health tweets: 283475
In [73]:
df_tweets_climate = pd.DataFrame.from_dict(tweets_climate, orient='index')
df_tweets_climate.head(10)
Out[73]:
In [74]:
df_tweets_health = pd.DataFrame.from_dict(tweets_health, orient='index')
df_tweets_health.head(10)
Out[74]:
date location text hashtags user sentiment user_mentions
1022963496181542912 7 2018 los Rios RT @Salud_CZ5: #Guayas | Distrito 09D17 Milagr... [#Guayas] @GALORODRIGUEZ8 (no_polarity, no_subjectivity) [@Salud_CZ5]
1022963498345746432 7 2018 Bogotá, Colombia Qué cinismo de la admón. de @EnriquePenalosa e... [] @Juan_Florez (no_polarity, no_subjectivity) [@EnriquePenalosa, @TransMilenio]
1022963498790342656 7 2018 En Encinas de Esgueva RT @educacyl: ''Protocolo de coordinación del ... [#TDAH] @oalpick (no_polarity, no_subjectivity) [@educacyl, @Salud_JCYL]
1022963499281068032 7 2018 Greenwich ct RT @MashiRafael: Moreno es mentiroso compulsiv... [] @kamyla315 (no_polarity, no_subjectivity) [@MashiRafael]
1022963506134564865 7 2018 Rosario RT @JuvsocialistaSF: Nuestras vidas, nuestros ... [] @juan_silvero (no_polarity, no_subjectivity) [@JuvsocialistaSF]
1022963509057998849 7 2018 not_specified RT @activacongreso: "Si nos quedamos embarazad... [] @carlosbaz2 (no_polarity, no_subjectivity) [@activacongreso]
1022963513688436736 7 2018 not_specified RT @Univ_Ciencia: #EclipseLunar Las mejores f... [#EclipseLunar, #Fotogaleria] @Leevahoogk (no_polarity, no_subjectivity) [@Univ_Ciencia]
1022963515257233413 7 2018 CARTAGENA - COLOMBIA ¿SABÍAS QUE EL LIMÓN CONGELADO PODRÍA SER MÁS ... [] @rovaroas (no_polarity, no_subjectivity) []
1022963515454308354 7 2018 ÜT: 10.593646,-66.990125 RT @Psicovivir: Un chavista me dice "cuando se... [] @avpa80 (no_polarity, no_subjectivity) [@Psicovivir]
1022963515991236608 7 2018 Almagro, Buenos Aires @LinaSarria4 @MarianaLestelle Basta con la men... [] @NahuelTorterolo (no_polarity, no_subjectivity) [@LinaSarria4, @MarianaLestelle]

Keywords

In [75]:
import json
import os
import re

keywords_file = os.path.join("..", "keywords", language + ".json")

climate_dict = []
health_dict = []
compound_terms = []

def normalise_keywords(dictionary): #lowercases and handles compounds
    for i in range(0, len(dictionary)):
        keyword = dictionary[i].lower()
        compound = keyword.replace(' ','_')
        if compound != keyword:
            keyword = compound
            words = tuple(compound.split('_'))
            compound_terms.append(words)
        dictionary[i] = keyword
    return dictionary

def generate_hashtags(dictionary):
    hashtags_dict = []
    for keyword in dictionary:
        hashtags_dict.append("#" + keyword.replace('_',''))
    return hashtags_dict

with open(keywords_file) as f:
    data = json.load(f)

climate_dict = normalise_keywords(data['climate'])
health_dict = normalise_keywords(data['health'])

climate_hashtag_dict = generate_hashtags(climate_dict)
health_hashtag_dict = generate_hashtags(health_dict)
In [76]:
health_dict
Out[76]:
['malaria',
 'diarrea',
 'infección',
 'enfermedad',
 'sars',
 'sarampión',
 'neumonía',
 'epidemia',
 'pandemia',
 'salud_pública',
 'salud',
 'epidemiología',
 'salud',
 'salud',
 'mortalidad',
 'morbilidad',
 'nutrición',
 'enfermedad',
 'enfermedad_infecciosa',
 'ncd',
 'no_transmisible',
 'enfermedad_contagiosa',
 'transmisible',
 'contaminación_del_aire',
 'nutrición',
 'desnutrición',
 'trastorno_mental',
 'retraso_del_crecimiento']
In [77]:
health_hashtag_dict
Out[77]:
['#malaria',
 '#diarrea',
 '#infección',
 '#enfermedad',
 '#sars',
 '#sarampión',
 '#neumonía',
 '#epidemia',
 '#pandemia',
 '#saludpública',
 '#salud',
 '#epidemiología',
 '#salud',
 '#salud',
 '#mortalidad',
 '#morbilidad',
 '#nutrición',
 '#enfermedad',
 '#enfermedadinfecciosa',
 '#ncd',
 '#notransmisible',
 '#enfermedadcontagiosa',
 '#transmisible',
 '#contaminacióndelaire',
 '#nutrición',
 '#desnutrición',
 '#trastornomental',
 '#retrasodelcrecimiento']
In [78]:
climate_dict
Out[78]:
['cambio_climático',
 'calentamiento_global',
 'temperatura',
 'clima_extremo',
 'cambio_ambiental_global',
 'variabilidad_climática',
 'invernadero',
 'bajo_carbono',
 'ghge',
 'energía_renovable',
 'emisiones_de_carbono',
 'emisiones_de_co2',
 'contaminantes_climáticos']
In [79]:
climate_hashtag_dict
Out[79]:
['#cambioclimático',
 '#calentamientoglobal',
 '#temperatura',
 '#climaextremo',
 '#cambioambientalglobal',
 '#variabilidadclimática',
 '#invernadero',
 '#bajocarbono',
 '#ghge',
 '#energíarenovable',
 '#emisionesdecarbono',
 '#emisionesdeco2',
 '#contaminantesclimáticos']

Analysing Tweets

In [80]:
location_threshold = 20
In [81]:
## Attempt to normalise location names by inferring the country when it's the last part of a location name.
## This is not used in the remaining of the notebook.

import re

place_country_regex = re.compile(r"^(?P<place>[^,]+?)[ ]*,[ ]*(?P<country>[^,$]+)[ \.]*$")

inferred_countries = {}
inferred_locations = {}

def infer_country(location):
    match = place_country_regex.match(location)
    if match:
        inferred_countries[match.group('place')] = match.group('country')
        inferred_countries[location] = match.group('country')

def get_inferred_location(location):
    if location in inferred_countries.keys():
        return inferred_countries[location]
    else:
        return location

for tweet in tweets_health.values():
    infer_country(tweet['location'])
    
for tweet in tweets_health.values():
    inferred_location = get_inferred_location(tweet['location'])
    inferred_locations[inferred_location] = inferred_locations.get(inferred_location, 0) + 1
In [82]:
locations = {}
dates = {}
sentiments = {}

for tweet in tweets_health.values():
    locations[tweet['location']] = locations.get(tweet['location'], 0) + 1
    dates[tweet['date']] = dates.get(tweet['date'], 0) + 1

    (polarity, subjectivity) = tweet['sentiment']
    sentiments[polarity] = sentiments.get(polarity, 0) + 1
    sentiments[subjectivity] = sentiments.get(subjectivity, 0) + 1
In [83]:
print("%d %d" % (len(locations.keys()), len(inferred_locations.keys())))
41083 31764
In [84]:
df_locations = pd.DataFrame(sorted(locations.items(), key=lambda k: k[1], reverse=True), columns=["Location", "Number of tweets"])
df_locations.loc[(df_locations["Number of tweets"] >= 100)]
Out[84]:
Location Number of tweets
0 not_specified 96523
1 Venezuela 6819
2 Comunidad de Madrid, España 4194
3 Argentina 3351
4 España 3149
5 Buenos Aires, Argentina 2886
6 Caracas, Venezuela 2851
7 México 2734
8 Chile 2490
9 Colombia 1884
10 Santiago, Chile 1703
11 Madrid 1433
12 Madrid, Comunidad de Madrid 1415
13 Panamá 1375
14 Ecuador 1336
15 Bogotá, D.C., Colombia 1333
16 Nicaragua 1240
17 Ciudad Autónoma de Buenos Aire 1131
18 Caracas 1130
19 Buenos Aires 969
20 Lima, Peru 930
21 Córdoba, Argentina 819
22 Mexico 753
23 venezuela 740
24 Barcelona 735
25 Montevideo, Uruguay 700
26 Paraguay 656
27 Madrid, España 617
28 Caracas - Venezuela 604
29 Rosario, Argentina 602
... ... ...
144 ESPAÑA 117
145 Asunción, Paraguay 117
146 Entre Ríos, Argentina 116
147 Managua, Nicaragua 115
148 Bucaramanga, Colombia 115
149 Mérida, Yucatán 114
150 Quilmes, Argentina 114
151 Argentina 114
152 Barquisimeto 112
153 santiago 111
154 MEXICO 110
155 CABA 110
156 Republica Dominicana 108
157 San Juan, Argentina 108
158 Ñuñoa, Chile 108
159 Buenos Aires - Argentina 107
160 Ciudad Guayana, Venezuela 107
161 Barinas - Venezuela 106
162 Córdoba 105
163 Cuenca, Ecuador 105
164 Ciudad Bolivar, Venezuela 104
165 Bahía Blanca, Argentina 104
166 Cancún, México 103
167 Maracay 103
168 México 102
169 Nuevo León, México 102
170 Providencia, Chile 102
171 Cojedes, Venezuela 101
172 Táchira, Venezuela 101
173 Quito - Ecuador 100

174 rows × 2 columns

In [85]:
df_dates = pd.DataFrame(sorted(dates.items(), key=lambda k: k[1], reverse=True), columns=["Date", "Number of tweets"])
df_dates
Out[85]:
Date Number of tweets
0 7 2018 283475
In [86]:
df_sentiments = pd.DataFrame(sorted(sentiments.items(), key=lambda k: k[1], reverse=True), columns=["Sentiment", "Number of tweets"])
df_sentiments
Out[86]:
Sentiment Number of tweets
0 no_polarity 283475
1 no_subjectivity 283475
In [87]:
from nltk.tokenize import MWETokenizer, TweetTokenizer
import re

tweet_tokenizer = TweetTokenizer()
tokenizer = MWETokenizer(compound_terms)

per_location = {}
average_per_location = {}
proportion_per_location = {}
per_date = {}
average_per_date = {}
proportion_per_date = {}
per_sentiment = {}
average_per_sentiment = {}
proportion_per_sentiment = {}
per_user = {}
proportion_per_user = {}
histogram_number_of_mentions = {}

global_count_health_keywords = {}
global_count_climate_keywords = {}
global_count_health = 0
global_count_intersection = 0

global_health_contexts = []
global_health_hashtag_contexts = []
global_health_user_mentions_contexts = []
global_intersection_contexts = []
global_intersection_hashtag_contexts = []
global_intersection_user_mentions_contexts = []

cooccurrence_matrix = {}
hashtags_cooccurrence_matrix = {}
mixed_cooccurrence_matrix = {}

for termset in ["health", "intersection"]:
    per_location[termset] = {}
    average_per_location[termset] = {}
    proportion_per_location[termset] = {}
    per_date[termset] = {}
    average_per_date[termset] = {}
    proportion_per_date[termset] = {}
    per_sentiment[termset] = {}
    average_per_sentiment[termset] = {}
    proportion_per_sentiment[termset] = {}
    per_user[termset] = {}
    proportion_per_user[termset] = {}
    histogram_number_of_mentions[termset] = {}
    
for tweet in tweets_health.values():
    text = tweet['text']
    hashtags = tweet['hashtags']
    location = tweet['location']
    date = tweet['date']
    (polarity, subjectivity) = tweet['sentiment']
    user  = tweet['user']
    user_mentions = tweet['user_mentions']
    
    wordlist = tweet_tokenizer.tokenize(text.lower())
    compounds_wordlist = tokenizer.tokenize(wordlist)
    filtered_compounds_wordlist = [w for w in compounds_wordlist if (len(w) > 3)]

    health_contexts = []
    total_intersection_mentions = 0
    total_health_mentions = 0
    
    health_words = []
    climate_words = []
    for word in filtered_compounds_wordlist:
        if word in health_dict:
            total_health_mentions += 1
            context = filtered_compounds_wordlist
            health_contexts.append(context)
            global_health_contexts.extend(context)
            global_health_hashtag_contexts.extend(hashtags)
            global_health_user_mentions_contexts.extend(user_mentions)
            global_count_health_keywords[word] = global_count_health_keywords.get(word, 0) + 1
            health_words.append(word)
            
        if word in climate_dict:  # means intersection, since we are processing health tweets here
            total_intersection_mentions += 1
            context = filtered_compounds_wordlist
            global_intersection_contexts.extend(context)
            global_intersection_hashtag_contexts.extend(hashtags)
            global_intersection_user_mentions_contexts.extend(user_mentions)
            global_count_climate_keywords[word] = global_count_climate_keywords.get(word, 0) + 1
            climate_words.append(word)
    
    if (len(health_words) > 0) and (len(climate_words) > 0): #means intersection
        for hword in health_words:
            if hword not in cooccurrence_matrix.keys():
                cooccurrence_matrix[hword] = {}
            if hword not in mixed_cooccurrence_matrix.keys():
                mixed_cooccurrence_matrix[hword] = {}
            for hashtag in hashtags:
                hashtag = hashtag.lower()            
                mixed_cooccurrence_matrix[hword][hashtag] = mixed_cooccurrence_matrix[hword].get(hashtag, 0) + 1            
            for cword in climate_words:
                cooccurrence_matrix[hword][cword] = cooccurrence_matrix[hword].get(cword, 0) + 1
                if cword not in mixed_cooccurrence_matrix.keys():
                    mixed_cooccurrence_matrix[cword] = {}
                for hashtag in hashtags:
                    hashtag = hashtag.lower()            
                    mixed_cooccurrence_matrix[cword][hashtag] = mixed_cooccurrence_matrix[cword].get(hashtag, 0) + 1

    if total_intersection_mentions > 0:
        for htag1 in hashtags:
            htag1 = htag1.lower()
            if htag1 not in hashtags_cooccurrence_matrix:
                hashtags_cooccurrence_matrix[htag1] = {}
            for htag2 in hashtags:
                htag2 = htag2.lower()
                if htag1 != htag2:
                    hashtags_cooccurrence_matrix[htag1][htag2] = hashtags_cooccurrence_matrix[htag1].get(htag2, 0) + 1
        
    if total_health_mentions == 0:
        # assuming all tweets collected using health keywords contain at least one health term
        total_health_mentions = 1
        context = filtered_compounds_wordlist
        health_contexts.append(context)
        global_health_contexts.extend(context)
        global_health_hashtag_contexts.extend(hashtags)
        global_health_user_mentions_contexts.extend(user_mentions)
    
    global_count_health += total_health_mentions
    global_count_intersection += total_intersection_mentions
   
    histogram_number_of_mentions["health"][total_health_mentions] = histogram_number_of_mentions["health"].get(total_health_mentions, 0) + 1
    histogram_number_of_mentions["intersection"][total_intersection_mentions] = histogram_number_of_mentions["intersection"].get(total_intersection_mentions, 0) + 1
    
    if locations[location] >= location_threshold:
        per_location["health"][location] = per_location["health"].get(location,0) + total_health_mentions
        per_location["intersection"][location] = per_location["intersection"].get(location,0) + total_intersection_mentions

    per_date["health"][date] = per_date["health"].get(date,0) + total_health_mentions
    per_date["intersection"][date] = per_date["intersection"].get(date,0) + total_intersection_mentions

    per_user["health"][user] = per_user["health"].get(user,0) + total_health_mentions
    per_user["intersection"][user] = per_user["intersection"].get(user,0) + total_intersection_mentions

    per_sentiment["health"][polarity] = per_sentiment["health"].get(polarity,0) + total_health_mentions
    per_sentiment["intersection"][polarity] = per_sentiment["intersection"].get(polarity,0) + total_intersection_mentions
    per_sentiment["health"][subjectivity] = per_sentiment["health"].get(subjectivity,0) + total_health_mentions
    per_sentiment["intersection"][subjectivity] = per_sentiment["intersection"].get(subjectivity,0) + total_intersection_mentions
    
    if total_health_mentions > 0:
        if locations[location] >= location_threshold:
            proportion_per_location["health"][location] = proportion_per_location["health"].get(location,0) + 1
        proportion_per_date["health"][date] = proportion_per_date["health"].get(date,0) + 1
        proportion_per_sentiment["health"][polarity] = proportion_per_sentiment["health"].get(polarity,0) + 1
        proportion_per_sentiment["health"][subjectivity] = proportion_per_sentiment["health"].get(subjectivity,0) + 1
        proportion_per_user["health"][user] = proportion_per_user["health"].get(user,0) + 1

    if total_intersection_mentions > 0:
        if locations[location] >= location_threshold:
            proportion_per_location["intersection"][location] = proportion_per_location["intersection"].get(location,0) + 1        
        proportion_per_date["intersection"][date] = proportion_per_date["intersection"].get(date,0) + 1        
        proportion_per_sentiment["intersection"][polarity] = proportion_per_sentiment["intersection"].get(polarity,0) + 1
        proportion_per_sentiment["intersection"][subjectivity] = proportion_per_sentiment["intersection"].get(subjectivity,0) + 1
        proportion_per_user["intersection"][user] = proportion_per_user["intersection"].get(user,0) + 1        

for location in locations.keys():
    if locations[location] >= location_threshold:
        average_per_location["health"][location] = per_location["health"][location]/locations[location]
        average_per_location["intersection"][location] = per_location["intersection"][location]/locations[location]

        proportion_per_location["health"][location] = proportion_per_location["health"].get(location,0)/locations[location] * 100
        proportion_per_location["intersection"][location] = proportion_per_location["intersection"].get(location,0)/locations[location] * 100

for date in dates.keys():
    average_per_date["health"][date] = per_date["health"][date]/dates[date]
    average_per_date["intersection"][date] = per_date["intersection"][date]/dates[date]

    proportion_per_date["health"][date] = proportion_per_date["health"].get(date,0)/dates[date] * 100
    proportion_per_date["intersection"][date] = proportion_per_date["intersection"].get(date,0)/dates[date] * 100

for sentiment in sentiments.keys():
    average_per_sentiment["health"][sentiment] = per_sentiment["health"][sentiment]/sentiments[sentiment]
    average_per_sentiment["intersection"][sentiment] = per_sentiment["intersection"][sentiment]/sentiments[sentiment]

    proportion_per_sentiment["health"][sentiment] = proportion_per_sentiment["health"].get(sentiment,0)/sentiments[sentiment] * 100
    proportion_per_sentiment["intersection"][sentiment] = proportion_per_sentiment["intersection"].get(sentiment,0)/sentiments[sentiment] * 100
    
for user in proportion_per_user["health"].keys():
    proportion_per_user["health"][user] = proportion_per_user["health"].get(user,0)/global_count_health * 100

for user in proportion_per_user["intersection"].keys():
    proportion_per_user["intersection"][user] = proportion_per_user["intersection"].get(user,0)/global_count_intersection * 100

Visualising the result of the text analysis

Histogram: Logarithm of number of mentions

Each bar corresponds to number of tweets with x number of mentions. There are no tweets with 0 (zero) health mentions since tweets were collected using the keywords so they occur at least once in each tweet.

In [88]:
df_histogram_number_of_mentions = pd.DataFrame(data=histogram_number_of_mentions)
df_histogram_number_of_mentions.plot.bar(logy=True, figsize=(20,5))
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35a727c048>

Network graphs of word/hashtag co-occurrence on intersection tweets

Blue nodes are health keywords, green nodes are climate keywords and red nodes are hashtags. The closer the nodes are to each other, the more often the words co-occur.

Network graph of co-occurrence of health and climate keywords

In [90]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np


G = nx.Graph()

health_nodes = []
climate_nodes = []

for word1 in cooccurrence_matrix.keys():
    for word2 in cooccurrence_matrix[word1].keys():
        if cooccurrence_matrix[word1][word2] > language_ref[language]['min_coocurrence']:
            G.add_edge(word1, word2, weight=cooccurrence_matrix[word1][word2])
            health_nodes.append(word1)
            climate_nodes.append(word2)

plt.figure(figsize=(15,15))

pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=health_nodes, node_size=1000, node_color='b')
nx.draw_networkx_nodes(G, pos, nodelist=climate_nodes, node_size=1000, node_color='g')
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')

plt.axis('off')
plt.show()

Network graph of co-occurrence of keywords (health or climate) and general hashtags

Only occurrences in intersection tweets are considered

In [91]:
G = nx.Graph()

health_nodes = []
climate_nodes = []
hashtag_nodes = []

for word in mixed_cooccurrence_matrix.keys():
    for hashtag in mixed_cooccurrence_matrix[word].keys():
        if mixed_cooccurrence_matrix[word][hashtag] > language_ref[language]['min_coocurrence']:
            G.add_edge(word, hashtag, weight=mixed_cooccurrence_matrix[word][hashtag])
            hashtag_nodes.append(hashtag)
            if word in health_dict:
                health_nodes.append(word)
            if word in climate_dict:
                climate_nodes.append(word)

plt.figure(figsize=(15,15))

pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=health_nodes, node_size=1000, node_color='b')
nx.draw_networkx_nodes(G, pos, nodelist=climate_nodes, node_size=1000, node_color='g')
nx.draw_networkx_nodes(G, pos, nodelist=hashtag_nodes, node_size=1000, node_color='r')
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
#edge_labels = nx.get_edge_attributes(G,'weight')
#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, font_family='sans-serif')

plt.axis('off')
plt.show()

Network graph of co-occurrence of hashtags

Only occurrences in intersection tweets are considered

In [92]:
G = nx.Graph()

nodes = []
for htag1 in hashtags_cooccurrence_matrix.keys():
    for htag2 in hashtags_cooccurrence_matrix[htag1].keys():
        if hashtags_cooccurrence_matrix[htag1][htag2] > language_ref[language]['min_coocurrence_hashtags']:
            G.add_edge(htag1, htag2, weight=hashtags_cooccurrence_matrix[htag1][htag2])
            nodes.append(htag1)
            nodes.append(htag2)
        

plt.figure(figsize=(15,15))

pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, nodelist=nodes, node_size=1000)
nx.draw_networkx_edges(G, pos, edgelist=G.edges(data=True), width=2)
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')
#edge_labels = nx.get_edge_attributes(G,'weight')
#nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, font_family='sans-serif')

plt.axis('off')
plt.show()

Sentiment of tweets including relevant references per year: total

In [93]:
df_per_sentiment = pd.DataFrame(data=per_sentiment, index=["positive", "neutral", "negative", "objective", "subjective"])
df_per_sentiment
Out[93]:
health intersection
positive NaN NaN
neutral NaN NaN
negative NaN NaN
objective NaN NaN
subjective NaN NaN
In [94]:
df_per_sentiment.T.filter(items=["health"],axis=0).plot.bar(figsize=(20,5))
Out[94]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35942f1048>
In [95]:
df_per_sentiment.T.filter(items=["intersection"],axis=0).plot.bar(figsize=(20,5))
Out[95]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35b4ea6a90>

References per date: total, average, proportion

In [96]:
df_per_date = pd.DataFrame(data=per_date)
df_per_date
Out[96]:
health intersection
7 2018 288426 129
In [97]:
ax = df_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Total number of references")
ax
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f359d0746a0>
In [98]:
df_average_per_date = pd.DataFrame(data=average_per_date)
df_average_per_date
Out[98]:
health intersection
7 2018 1.017465 0.000455
In [99]:
ax = df_average_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Average number of references")
ax
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f359b8aff28>
In [100]:
df_proportion_per_date = pd.DataFrame(data=proportion_per_date)
df_proportion_per_date
Out[100]:
health intersection
7 2018 100.0 0.045507
In [101]:
ax = df_proportion_per_date.plot.line(figsize=(15,5))
ax.set_xlabel("Date")
ax.set_ylabel("Proportion of tweets (%)")
ax.set_ylim(ymin=0)
ax
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35ac3235c0>

References per location: total, average, proportion

Due to the vast number of different locations defined by Twitter users, the tables show only locations for which there are references to health keywords, while the plots show only locations for which there are references to both health and climate keywords (intersection)

In [102]:
df_per_location = pd.DataFrame(data=per_location)
df_per_location.sort_values(by=['intersection'], ascending=False).head(100)
Out[102]:
health intersection
not_specified 98246 49
España 3194 6
Madrid 1453 4
Colombia 1914 3
Madrid, España 624 3
Madrid, Comunidad de Madrid 1425 3
Puerto Varas, Chile 33 2
México 2778 2
Barcelona, Cataluña 95 1
Santiago, Metropolitana de Santiago 73 1
Santiago de Compostela, España 28 1
Madrid (España) 37 1
Monterrey, Nuevo León 425 1
Santa Marta, Colombia 44 1
San Sebastián, España 39 1
Córdoba, Argentina 830 1
ESPAÑA 121 1
Extremadura, España 41 1
Estados Unidos 239 1
San José, Costa Rica 129 1
México, D.F. 164 1
Galicia 120 1
Bogotá, D.C., Colombia 1344 1
Gran Canaria 24 1
Tenerife 55 1
Osorno, Chile 44 1
Rosario, Argentina 608 1
Chile 2527 1
Buenos Aires, Argentina 2952 1
Cuautitlán Izcalli, México 23 1
... ... ...
Resistencia, Argentina 79 0
Pamplona 24 0
Pamplona, España 32 0
Pilar, Argentina 159 0
Piura, Peru 40 0
Quintana Roo, México 91 0
Punto Fijo, Venezuela 23 0
Puerto Rico 307 0
Quito-Ecuador 46 0
Puerto Rico, USA 105 0
Puerto Varas 28 0
Quito, Ecuador 527 0
Punta Arenas 22 0
Punta Arenas, Chile 66 0
Punto Fijo Falcon Venezuela 40 0
Querétaro 33 0
Puerto Ordaz, Venezuela 20 0
Quito Ecuador 43 0
Querétaro Arteaga, México 30 0
Querétaro, México 30 0
Querétaro, Querétaro Arteaga 69 0
Quevedo, Ecuador 29 0
Quilmes 37 0
Quilmes, Argentina 115 0
Quito - Ecuador 100 0
Puerto Ordáz, Venezuela 60 0
Puerto Ordaz 38 0
Región de Murcia 24 0
Rancagua 54 0
Planeta Tierra 118 0

100 rows × 2 columns

In [103]:
ax = df_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(stacked=True,figsize=(15,5), logy=True)
ax.set_xlabel("Location")
ax.set_ylabel("Total number of references")
ax
Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f359bedb2e8>
In [104]:
df_average_per_location = pd.DataFrame(data=average_per_location)
df_average_per_location.sort_values(by=['intersection'], ascending=False).head(100)
Out[104]:
health intersection
Puerto Varas, Chile 1.000000 0.060606
Cuautitlán Izcalli, México 1.000000 0.043478
Gran Canaria 1.000000 0.041667
Santiago de Compostela, España 1.000000 0.035714
Madrid (España) 1.027778 0.027778
San Sebastián, España 1.000000 0.025641
Extremadura, España 1.025000 0.025000
Santa Marta, Colombia 1.047619 0.023810
Osorno, Chile 1.000000 0.022727
Tenerife 1.037736 0.018868
Santiago, Metropolitana de Santiago 1.000000 0.013699
Barcelona, Cataluña 1.000000 0.010526
ESPAÑA 1.034188 0.008547
Galicia 1.016949 0.008475
San José, Costa Rica 1.000000 0.007752
México, D.F. 1.006135 0.006135
Madrid, España 1.011345 0.004862
Estados Unidos 1.012712 0.004237
Madrid 1.013957 0.002791
Monterrey, Nuevo León 1.011905 0.002381
Madrid, Comunidad de Madrid 1.007067 0.002120
España 1.014290 0.001905
Rosario, Argentina 1.009967 0.001661
Colombia 1.015924 0.001592
Córdoba, Argentina 1.013431 0.001221
Bogotá, D.C., Colombia 1.008252 0.000750
México 1.016094 0.000732
not_specified 1.017851 0.000508
Chile 1.014859 0.000402
Buenos Aires, Argentina 1.022869 0.000347
... ... ...
Palma, España 1.000000 0.000000
Resistencia, Argentina 1.039474 0.000000
Pamplona 1.000000 0.000000
Pilar, Argentina 1.169118 0.000000
Piura 1.000000 0.000000
Piura, Peru 1.025641 0.000000
Punto Fijo Falcon Venezuela 1.000000 0.000000
Puerto Ordáz, Venezuela 1.000000 0.000000
Puerto Rico 1.054983 0.000000
Quito, Ecuador 1.005725 0.000000
Puerto Rico, USA 1.093750 0.000000
Puerto Varas 1.037037 0.000000
Quito Ecuador 1.023810 0.000000
Punta Arenas 1.000000 0.000000
Punta Arenas, Chile 1.000000 0.000000
Punto Fijo, Venezuela 1.000000 0.000000
Puerto Ordaz 1.027027 0.000000
Querétaro 1.000000 0.000000
Quito - Ecuador 1.000000 0.000000
Querétaro Arteaga, México 1.000000 0.000000
Querétaro, México 1.000000 0.000000
Querétaro, Querétaro Arteaga 1.014706 0.000000
Quevedo, Ecuador 1.035714 0.000000
Quilmes 1.000000 0.000000
Quilmes, Argentina 1.008772 0.000000
Puerto Ordaz, Venezuela 1.000000 0.000000
Puerto Montt, Chile 1.000000 0.000000
Región de Murcia 1.000000 0.000000
Rancagua 1.058824 0.000000
Planeta Tierra 1.008547 0.000000

100 rows × 2 columns

In [105]:
ax = df_average_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(stacked=True,figsize=(15,5))
ax.set_xlabel("Location")
ax.set_ylabel("Average number of references")
ax
Out[105]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35a4059240>
In [106]:
df_proportion_per_location = pd.DataFrame(data=proportion_per_location)
df_proportion_per_location.sort_values(by=['intersection'], ascending=False).head(100)
Out[106]:
health intersection
Puerto Varas, Chile 100.0 6.060606
Cuautitlán Izcalli, México 100.0 4.347826
Gran Canaria 100.0 4.166667
Santiago de Compostela, España 100.0 3.571429
Madrid (España) 100.0 2.777778
San Sebastián, España 100.0 2.564103
Extremadura, España 100.0 2.500000
Santa Marta, Colombia 100.0 2.380952
Osorno, Chile 100.0 2.272727
Tenerife 100.0 1.886792
Santiago, Metropolitana de Santiago 100.0 1.369863
Barcelona, Cataluña 100.0 1.052632
ESPAÑA 100.0 0.854701
Galicia 100.0 0.847458
San José, Costa Rica 100.0 0.775194
México, D.F. 100.0 0.613497
Madrid, España 100.0 0.486224
Estados Unidos 100.0 0.423729
Madrid 100.0 0.279135
Monterrey, Nuevo León 100.0 0.238095
Madrid, Comunidad de Madrid 100.0 0.212014
España 100.0 0.190537
Rosario, Argentina 100.0 0.166113
Colombia 100.0 0.159236
Córdoba, Argentina 100.0 0.122100
Bogotá, D.C., Colombia 100.0 0.075019
México 100.0 0.073153
not_specified 100.0 0.050765
Chile 100.0 0.040161
Buenos Aires, Argentina 100.0 0.034650
... ... ...
Palma, España 100.0 0.000000
Resistencia, Argentina 100.0 0.000000
Pamplona 100.0 0.000000
Pilar, Argentina 100.0 0.000000
Piura 100.0 0.000000
Piura, Peru 100.0 0.000000
Punto Fijo Falcon Venezuela 100.0 0.000000
Puerto Ordáz, Venezuela 100.0 0.000000
Puerto Rico 100.0 0.000000
Quito, Ecuador 100.0 0.000000
Puerto Rico, USA 100.0 0.000000
Puerto Varas 100.0 0.000000
Quito Ecuador 100.0 0.000000
Punta Arenas 100.0 0.000000
Punta Arenas, Chile 100.0 0.000000
Punto Fijo, Venezuela 100.0 0.000000
Puerto Ordaz 100.0 0.000000
Querétaro 100.0 0.000000
Quito - Ecuador 100.0 0.000000
Querétaro Arteaga, México 100.0 0.000000
Querétaro, México 100.0 0.000000
Querétaro, Querétaro Arteaga 100.0 0.000000
Quevedo, Ecuador 100.0 0.000000
Quilmes 100.0 0.000000
Quilmes, Argentina 100.0 0.000000
Puerto Ordaz, Venezuela 100.0 0.000000
Puerto Montt, Chile 100.0 0.000000
Región de Murcia 100.0 0.000000
Rancagua 100.0 0.000000
Planeta Tierra 100.0 0.000000

100 rows × 2 columns

In [107]:
ax = df_proportion_per_location.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(figsize=(15,5))
ax.set_xlabel("Location")
ax.set_ylabel("Proportion of tweets (%)")
ax
Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35b2f2aa90>

References per sender (Twitter username): total and proportion

In [108]:
df_per_user = pd.DataFrame(data=per_user)
df_per_user.sort_values(by=['intersection'], ascending=False).head(50)
Out[108]:
health intersection
@Ayto_Guadarrama 2 2
@Hestia75382213 3 2
@ConcepcinArrie2 4 1
@pilarynes71 2 1
@daniambiental 1 1
@mariaje1956 2 1
@auroramg2012 2 1
@ted_bundy2512 4 1
@IldefonsoM 14 1
@fotosvdea 1 1
@guillepmillan 1 1
@DoradoAlex 1 1
@Lukalofonos 4 1
@animo_pollitos 1 1
@mafgodino 1 1
@isandaluz2 2 1
@PabloVa35148977 3 1
@_Leti_Ac 1 1
@InfobaeTrends 61 1
@javier_noguera 3 1
@prevencionistas 5 1
@MarthaPerez 4 1
@DoradoGabriel 1 1
@Nicocarw 1 1
@dr_xeo 1 1
@Ricardo64373070 9 1
@GuadarramaNews 1 1
@imv1977 1 1
@danielsenderos 2 1
@FondoAdaptacion 2 1
@Davidcaton77 2 1
@Colibri321 2 1
@CRCiencia 2 1
@OSMAN_salud 3 1
@ale_ramsss 2 1
@puerta_bonita 2 1
@cfresneda1 1 1
@agencia_sinc 1 1
@idisalud 5 1
@Galef17 2 1
@Telenordcomdo 3 1
@MAmbientalistaC 3 1
@WASHITENE 1 1
@hoytufuturo 35 1
@JLjuag 1 1
@AytCiempozuelos 2 1
@COFMadrid 2 1
@jasanchez 1 1
@thomasmmr 3 1
@prisvilli 1 1
In [109]:
df_proportion_per_user = pd.DataFrame(data=proportion_per_user)
df_proportion_per_user.sort_values(by=['intersection'], ascending=False).head(50)
Out[109]:
health intersection
@Hestia75382213 0.001040 1.550388
@Ayto_Guadarrama 0.000693 1.550388
@112surem 0.000347 0.775194
@hoytufuturo 0.012135 0.775194
@fotosvdea 0.000347 0.775194
@froditamankell 0.000693 0.775194
@fuente_monica 0.000347 0.775194
@gaby_ma99 0.000347 0.775194
@guillepmillan 0.000347 0.775194
@imagrosa 0.001040 0.775194
@idisalud 0.001734 0.775194
@dr_xeo 0.000347 0.775194
@imv1977 0.000347 0.775194
@isahuger 0.000693 0.775194
@isandaluz2 0.000693 0.775194
@jacresposomolin 0.000347 0.775194
@eduarbike 0.000347 0.775194
@disfrutalacienc 0.003467 0.775194
@jaime_bara 0.000693 0.775194
@delaconcharex 0.000347 0.775194
@danielsenderos 0.000693 0.775194
@daniambiental 0.000347 0.775194
@cristian_forest 0.000347 0.775194
@climaNoticias 0.000347 0.775194
@chuchoaguilar9 0.000693 0.775194
@cfresneda1 0.000347 0.775194
@bertosabino 0.000693 0.775194
@auroramg2012 0.000693 0.775194
@annajuanroch 0.001387 0.775194
@animo_pollitos 0.000347 0.775194
@ale_ramsss 0.000693 0.775194
@aios30 0.000347 0.775194
@jade94779164 0.001040 0.775194
@jasanchez 0.000347 0.775194
@afecspain 0.000347 0.775194
@javier_noguera 0.001040 0.775194
@vmontielarquero 0.001387 0.775194
@viacum 0.000347 0.775194
@tinomarting 0.000347 0.775194
@thomasmmr 0.001040 0.775194
@teukrion 0.000347 0.775194
@ted_bundy2512 0.001387 0.775194
@sstefaniaruiz 0.000347 0.775194
@soyvictorbalam 0.000347 0.775194
@sdelavargaglez 0.000347 0.775194
@salazio_ 0.000347 0.775194
@rossyrox47 0.000347 0.775194
@puerta_bonita 0.000693 0.775194
@prisvilli 0.000347 0.775194
@primeralluvia 0.001734 0.775194
In [110]:
ax = df_proportion_per_user.sort_values(by=['intersection'], ascending=False).head(50).plot.bar(figsize=(15,5))
ax.set_xlabel("User")
ax.set_ylabel("Proportion of tweets (%)")
ax
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f35a63c3898>

Frenquency of keywords: health and climate

In [111]:
df_health_keywords = pd.DataFrame(sorted(global_count_health_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_health_keywords
Out[111]:
Keyword Number of mentions
0 salud 119792
1 enfermedad 18920
2 salud_pública 4796
3 desnutrición 4458
4 sarampión 3718
5 infección 2539
6 epidemia 2426
7 nutrición 2193
8 mortalidad 1879
9 diarrea 1834
10 malaria 1003
11 neumonía 580
12 pandemia 417
13 contaminación_del_aire 350
14 trastorno_mental 180
15 epidemiología 90
16 enfermedad_contagiosa 73
17 morbilidad 55
18 enfermedad_infecciosa 22
19 transmisible 5
20 sars 4
21 no_transmisible 2
In [112]:
df_climate_keywords = pd.DataFrame(sorted(global_count_climate_keywords.items(), key=lambda k: k[1], reverse=True), columns=["Keyword", "Number of mentions"])
df_climate_keywords
Out[112]:
Keyword Number of mentions
0 temperatura 72
1 cambio_climático 54
2 calentamiento_global 2
3 invernadero 1

Word clouds: health, climate, intersection

Up to 200 most frequent words that appear in the context of our health or climate keywords or both

In [113]:
import matplotlib.pyplot as plt
import collections
from wordcloud import WordCloud, STOPWORDS
from stop_words import get_stop_words

threshold = 200

language_specific_stopwords = get_stop_words(language)

def create_wordcloud(contexts, stopwords=[]):
    most_frequent_words = {}
    stopwords.extend(STOPWORDS)
    stopwords.extend(language_specific_stopwords)

    context_unigrams = collections.Counter(contexts)
    for word, freq in sorted(context_unigrams.items(), key=lambda k: k[1], reverse=True)[0:threshold]:
        if word not in stopwords:
            most_frequent_words[word] = freq

    wordcloud = WordCloud(background_color="white", scale=10).generate_from_frequencies(most_frequent_words)

    fig = plt.figure(1, figsize=(20, 12))
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()

Contexts for health keywords

In [114]:
create_wordcloud(global_health_contexts, health_dict + ['https'])

Contexts for intersection tweets (excludes health and climate keywords)

In [115]:
create_wordcloud(global_intersection_contexts, climate_dict + health_dict + ["https"])

Contexts for intersection tweets (includes the found health and climate keywords)

In [116]:
create_wordcloud(global_intersection_contexts, ["https"])

Hashtags that appear in tweets containing health keywords

In [117]:
lowercase_hashtags = []
for hashtag in global_health_hashtag_contexts:
    lowercase_hashtags.append(hashtag.lower())
create_wordcloud(lowercase_hashtags, health_hashtag_dict + ["https"])

Hashtags that appear in tweets containing the intersection of health and climate keywords

In [118]:
lowercase_hashtags = []
for hashtag in global_intersection_hashtag_contexts:
    lowercase_hashtags.append(hashtag.lower())
create_wordcloud(lowercase_hashtags, ["https"])

User mentions that appear in tweets containing health keywords

In [119]:
create_wordcloud(global_health_user_mentions_contexts)

User mentions that appear in tweets containing the intersection of health and climate keywords

In [120]:
create_wordcloud(global_intersection_user_mentions_contexts)